In [553]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
%load_ext memory_profiler
from IPython.display import Image, HTML
import datetime as dt
from hdbscan import HDBSCAN
from itertools import cycle
import math
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mpd
from mpl_toolkits.mplot3d import Axes3D
import numexpr as ne
import numpy as np
import pandas as pd
import pysolar
import pytz
import random
import scipy
import scipy.fftpack
import scipy.stats
from scipy.signal import medfilt, wiener
import scipy.signal as signal
import seaborn as sns
import sklearn
import sklearn.metrics as metrics
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from enerpi.base import timeit
from enerpi.api import enerpi_data_catalog
from enerpi.process_ldr import pca_clustering, plot_clustering, plot_clustering_as_voronoi, plot_clustering_figure, tableau20
#from enerpi.enerplot import plot_tile_last_24h, plot_power_consumption_hourly
from enerpi.sunposition import sun_position
from prettyprinting import *
# Data catalog and loading of all the stored data.
@timeit('LOAD ALL DATA', verbose=True)
def load_data():
    cat = enerpi_data_catalog()
    data, data_s = cat.get_all_data()
    print_info(data_s.tail())
    POWER = pd.DataFrame(data.power).tz_localize(TZ)
    print_cyan(POWER.describe().T.astype(int))
    homog_power = POWER.resample('1s').mean().fillna(method='bfill', limit=3).fillna(-1)  # .round().astype('int16')
    return data, data_s, POWER, homog_power
TZ = pytz.timezone('Europe/Madrid')
FS = (16, 10)
sns.set_style('ticks')
pd.set_option('display.width', 120)
cm = cycle(tableau20[2::2])
data, data_s, POWER, homog_power = load_data()
# Some Sampling:
power_day13 = homog_power.loc['2016-08-13']
power_day14 = homog_power.loc['2016-08-14']
power_standby = homog_power.loc['2016-08-29']
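The 1 s homogenization back-fills gaps of up to 3 s and marks longer gaps with -1, so missing stretches can be filtered out later. A minimal sketch of that fillna chain on toy data:
In [ ]:
s = pd.Series([100., np.nan, np.nan, np.nan, np.nan, 120.],
              index=pd.date_range('2016-08-13', periods=6, freq='1s'))
print(s.fillna(method='bfill', limit=3).fillna(-1).values)  # [100. -1. 120. 120. 120. 120.]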
In [191]:
def process_fft(serie_tt, with_freq=True):
    # Scipy Real FFT
    z_pure = scipy.fftpack.rfft(serie_tt - serie_tt.mean()) / len(serie_tt)
    if z_pure.shape[0] % 2 == 0:  # N is even
        mags = np.concatenate([[abs(z_pure[0])], np.sqrt(z_pure[1:-1:2]**2 + z_pure[2::2]**2), [abs(z_pure[-1])]])
    else:
        mags = np.concatenate([[abs(z_pure[0])], np.sqrt(z_pure[1::2]**2 + z_pure[2::2]**2)])
    if with_freq:
        y = scipy.fftpack.rfftfreq(len(serie_tt), d=(serie_tt.index[1] - serie_tt.index[0]).total_seconds())
        freqs = np.concatenate((np.zeros(1), y[1::2]))
        return freqs, mags
    return mags
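# Sanity check (a sketch, not part of the pipeline): folding the interleaved
# real/imag output of scipy.fftpack.rfft into magnitudes should match
# np.abs(np.fft.rfft(x)) / N:
_x = power_day13.power
print(np.allclose(process_fft(_x, with_freq=False),
                  np.abs(np.fft.rfft(_x - _x.mean())) / len(_x)))  # True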
@timeit('slice_window_fft', verbose=True)
def _slice_window_fft(serie_tt, size_w, frac_solape=.5, only_complete=True):
    # %timeit _slice_window_fft(power_standby, 14400, frac_solape=.75)
    # 100 loops, best of 3: 10.4 ms per loop
    N = len(serie_tt)
    start = serie_tt.index[0]
    end = serie_tt.index[-1]
    T = end - start
    Ts = serie_tt.index[1] - start
    delta = pd.Timedelta(seconds=int(round(((1 - frac_solape) * size_w * Ts).total_seconds())))
    # print(delta, (1 - frac_solape) * size_w * Ts)
    n = (T - size_w * Ts) / delta + 1
    results, freqs, init = [], None, False
    while start < end:
        data_w = serie_tt.loc[start:start + size_w * Ts - Ts]
        if only_complete and (data_w.shape[0] == size_w):
            if not init:
                freqs, mags = process_fft(data_w, with_freq=True)
                init = True
            else:
                mags = process_fft(data_w, with_freq=False)
            results.append((start, mags))
        # else:
        #     print('Skipping ts={} (shape={} != {})'.format(start, data_w.shape, size_w))
        start += delta
    return freqs, results
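# Window-count check (sketch): one day of 1 s samples, sliced in 10 min
# windows every 5 min, gives n = (86400 - 600) / 300 + 1 = 287 windows:
print((86400 - 600) // 300 + 1)  # 287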
@timeit('fft_window', verbose=True)
def fft_window(serie_tt, delta='3h', step='15min'):
    window_delta = pd.Timedelta(delta)
    window_step = pd.Timedelta(step)
    fr_s = 1 - window_step / window_delta
    window_size = window_delta / serie_tt.index.freq
    print_cyan('Data length: {}; window size: {:.0f}, step: {}, (overlap={:.3f})'
               .format(len(serie_tt), window_size, window_step, fr_s))
    freqs, results = _slice_window_fft(serie_tt, window_size, fr_s)
    return pd.DataFrame([pd.Series(data=r[1], index=freqs, name=r[0]) for r in results]).T
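# For reference (a sketch, not used below): scipy.signal.spectrogram computes
# a similar sliding-window spectrum, with nperseg/noverlap playing the roles
# of delta/step (it returns a PSD instead of the magnitudes used here):
_f, _t, _Sxx = signal.spectrogram(power_standby.power.values, fs=1.0,
                                  nperseg=3600, noverlap=3000)
print(_f.shape, _t.shape, _Sxx.shape)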
@timeit('plot_tt_fft', verbose=True)
def plot_tt_fft(serie_tt,
                freqs=None, mags=None,
                max_freq=.1, best_freqs=5,
                axes=None, figsize=(8, 8),
                params_plot_fft={}, params_plot_t={}, **params_window_fft):
    # PLOT tt + fft
    if axes is None:
        fig, axes = plt.subplots(2, 1, figsize=figsize,
                                 gridspec_kw=dict(hspace=0, wspace=0, height_ratios=[.45, .55]))
    ax = axes[0]
    ax.xaxis.tick_top()
    d_plot_t = dict(c=tableau20[4], lw=1, alpha=.8)
    d_plot_t.update(params_plot_t)
    serie_tt.tz_localize(None).plot(ax=ax, **d_plot_t)
    # ax.xaxis.set_major_formatter(mpd.DateFormatter('%H:%M:%S'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha='center')
    ax.xaxis.set_tick_params(labelsize=8, pad=2)
    ax.set_ylabel('(Power) time series', labelpad=3)
    ax.grid('on', axis='x')
    ax.set_yticks(ax.get_yticks()[1:])
    # FFT plot
    ax = axes[1]
    ax.xaxis.tick_bottom()
    if (mags is None) and len(params_window_fft) > 0:
        df_fft_w = fft_window(serie_tt, **params_window_fft)
        d_plot = dict(lw=.75, c='k', alpha=max(.05, 1 / len(df_fft_w.columns)))
        d_plot.update(params_plot_fft)
        df_fft_w.plot(ax=ax, legend='', **d_plot)
        df_fft_w.median(axis=1).plot(ax=ax, lw=.75, c=tableau20[2], alpha=.8)
        ax.set_xlim((0, min(max_freq, df_fft_w.max(axis=1).sort_values(ascending=False).index[0] * 10)))
    else:
        if mags is None:
            freqs, mags = process_fft(serie_tt, with_freq=True)
        d_plot = dict(c=tableau20[0], lw=1.25, alpha=.8)
        d_plot.update(params_plot_fft)
        ax.plot(freqs, mags, **d_plot)
        if best_freqs > 0:
            idx_max = np.argsort(mags)[-best_freqs:][::-1]
            mag_max = mags.max()
            for i, (f, mag) in enumerate(zip(freqs[idx_max], mags[idx_max])):
                fr = mag / mag_max
                ax.plot([f], [mag], 'o', markersize=10, markeredgewidth=1.5, markeredgecolor=tableau20[2], markerfacecolor=[0, 0, 0, 0])
                ax.annotate('f=1/{:.1f}, A={:.2g}'.format(1 / f, mag / mag_max), (f, mag),
                            xytext=(.97, .92 - i * .075), textcoords='axes fraction',
                            ha='right', va='bottom', fontsize=8, color=tableau20[2],
                            arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.05"))
        ax.set_xlim((0, min(max_freq, freqs[np.argmax(mags)] * 10)))
    ax.set_xlabel('Freq (Hz)', labelpad=1)
    ax.set_ylabel('Mag (√|mag|)', labelpad=3)
    # ax.set_xlim((2/N, (1/Ts)/2))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha='center')
    ax.xaxis.set_tick_params(labelsize=9, pad=3)
    # ax.set_yticks(ax.get_yticks()[:-1])
    return axes
def _gen_ts_sinusoidal(Ts=1, tf=3600,
                       freqs=[1 / 300, 1 / 225, 1 / 60, 1 / 20],
                       amps=[3, 7, 2, .5],
                       offsets=[3, 7, -5, .15],
                       mean=30):
    N = tf // Ts
    tt = pd.DatetimeIndex(freq=pd.Timedelta(seconds=Ts), start=pd.Timestamp.now(tz=TZ).replace(microsecond=0),
                          periods=N)
    t = np.linspace(0, tf, N)
    x = mean  # + np.random.random(N) * 0.3
    for a, f, o in zip(amps, freqs, offsets):
        x += a * np.cos(2 * np.pi * f * t + o)
    return tt, x
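# Round-trip check (sketch): the generator's component frequencies should
# come out as the strongest process_fft peaks (they fall on near-exact bins):
_tt, _xs = _gen_ts_sinusoidal()
_freqs, _mags = process_fft(pd.Series(_xs, index=_tt))
print(_freqs[np.argsort(_mags)[-4:]])  # ~1/20, 1/60, 1/300, 1/225 (ascending magnitude)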
def _show_info_clustering(X, labels, labels_true=None, with_silhouette_coef=False):
    n_clusters_ = len(set(labels))
    print_magenta('Estimated number of clusters: %d' % n_clusters_)
    if labels_true is not None:
        print_ok("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        print_ok("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        print_ok("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        print_ok("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
        print_ok("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    if with_silhouette_coef:
        print_info("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print_cyan(pd.DataFrame(pd.Series(labels).value_counts().rename('Label members')).T)
def _plot_reduced_clustering(X, labels=None, axes3d=False, figsize=(18, 5), s_resid=1, cm=None, **kwargs_plot):
    d_params = dict(alpha=.1, lw=0, s=7)
    d_params.update(kwargs_plot)
    if cm is None:
        # cm = cycle(tableau20[::2] + tableau20[1::2])
        cm = cycle(tableau20[::2])
    if labels is not None:
        labels_unique = set(labels)
        groups = [X[labels == k] for k in labels_unique]
        colors = [next(cm) if k >= 0 else 'k' for k in labels_unique]
        s = d_params.pop('s')
        sizes = [s if k >= 0 else s_resid for k in labels_unique]
    else:
        groups = [X]
        colors = ['k']
        sizes = [d_params.pop('s')]
    fig = plt.figure(figsize=figsize)
    if X.shape[1] > 2:
        if axes3d:
            ax = fig.add_subplot(111, projection='3d')
            for x, c, s in zip(groups, colors, sizes):
                d_params['color'] = c
                ax.scatter(x[:, 0], x[:, 1], x[:, 2], s=s, **d_params)
        else:
            ax1 = plt.subplot(131)
            ax2 = plt.subplot(132)
            ax3 = plt.subplot(133)
            for x, c, s in zip(groups, colors, sizes):
                d_params['color'] = c
                ax1.scatter(x[:, 0], x[:, 1], s=s, **d_params)
                ax2.scatter(x[:, 0], x[:, 2], s=s, **d_params)
                ax3.scatter(x[:, 1], x[:, 2], s=s, **d_params)
    else:
        ax = fig.add_subplot(111)
        for x, c, s in zip(groups, colors, sizes):
            d_params['color'] = c
            ax.scatter(x[:, 0], x[:, 1], s=s, **d_params)
In [3]:
df_results = fft_window(power_standby.power, delta='3h', step='10min')
df_results_2 = fft_window(power_day14.power, delta='3h', step='10min')
# Results over all the data:
results_FFT = fft_window(homog_power.power, delta='10min', step='5min')
print_ok(results_FFT.shape)
In [4]:
X = results_FFT.T.values
X_std = StandardScaler().fit_transform(X)
m_pca = PCA(n_components=2, copy=True, whiten=True)
#X_pca = m_pca.fit_transform(X_std)
X_pca = m_pca.fit_transform(X)
m_pca3 = PCA(n_components=3, copy=True, whiten=True)
#X_pca3 = m_pca3.fit_transform(X_std)
X_pca3 = m_pca3.fit_transform(X)
m_pca20 = PCA(n_components=20, copy=True, whiten=True)
#X_pca20 = m_pca20.fit_transform(X_std)
X_pca20 = m_pca20.fit_transform(X)
print(X.shape, X_std.shape, X_pca.shape, X_pca3.shape, X_pca20.shape)
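As a quick check (a sketch, not in the original run), the cumulative explained variance kept by each projection; explained_variance_ratio_ is the standard sklearn PCA attribute:
In [ ]:
for name, m in [('PCA-2', m_pca), ('PCA-3', m_pca3), ('PCA-20', m_pca20)]:
    print('{}: {:.1%} of total variance'.format(name, m.explained_variance_ratio_.sum()))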
In [85]:
axes = plot_tt_fft(power_day13.power, freqs=None, mags=None, max_freq=.1, best_freqs=0, figsize=(6, 5))
#axes = plot_tt_fft(power_day14.power, freqs=None, mags=None, max_freq=.1, best_freqs=5, axes=axes, delta='10min', step='5min')
freqs, mags = process_fft(power_day14.power)
axes = plot_tt_fft(power_day14.power, freqs, mags, max_freq=.1, best_freqs=3, axes=axes, params_plot_fft=dict(lw=2))
#axes=None, figsize=(8, 8), **params_window_fft
In [86]:
ax_t, ax_f = plot_tt_fft(power_day14.power, max_freq=.1, best_freqs=3, figsize=(6, 5),
delta='1h', step='10min', params_plot_fft=dict(lw=1, alpha=.1, c='k'))
#ax_f.set_ylim(0, 800)
plt.show()
In [1135]:
train = homog_power.loc['2016-09-08':'2016-09-21'].copy()
print_info('* Head:\n{}\n* Tail:\n{}\n* Describe:\n{}\nNull values:\n{}'
           .format(train.head(3), train.tail(3), train.describe().T,
                   train[train.power == -1].groupby(lambda x: x.date).count()))
# Plot tt + fft:
plot_tt_fft(train.power, max_freq=.1, best_freqs=7, params_plot_t=dict(lw=.25, alpha=.8))
plot_tt_fft(train.power, max_freq=.1, params_plot_t=dict(lw=.5, alpha=.7), delta='10min', step='2min');
In [124]:
df_fft_train_daily = fft_window(train.power, delta='24h', step='24h')
print_info(df_fft_train_daily.shape)
print_info(df_fft_train_daily.head())
plot_tt_fft(train.power, max_freq=.1, params_plot_t=dict(lw=.5, alpha=.7), delta='24h', step='24h');
In [171]:
# FFT 'features' per delta window, computed every step:
delta, step = '10min', '2min'
df_fft_train = fft_window(train.power, delta=delta, step=step)
print_info(df_fft_train.shape)
# Time-domain 'features' per delta window, computed every step
n_delta = int(pd.Timedelta(delta).total_seconds())
n_step = int(pd.Timedelta(step).total_seconds())
roll = train.power.rolling(n_delta)
df_tt_train = pd.DataFrame([roll.mean().shift(-(n_delta - 1)).iloc[::n_step].dropna().rename('r_mean'),
                            roll.median().shift(-(n_delta - 1)).iloc[::n_step].dropna().rename('r_median'),
                            roll.max().shift(-(n_delta - 1)).iloc[::n_step].dropna().rename('r_max'),
                            roll.min().shift(-(n_delta - 1)).iloc[::n_step].dropna().rename('r_min'),
                            roll.std().shift(-(n_delta - 1)).iloc[::n_step].dropna().rename('r_std')])
#df_tt_train.T.values
#df_tt_train.T.plot(lw=.75, alpha=.7)
df_train = df_tt_train.T.join(df_fft_train.T).T
df_train.head(10).T.head(3)
Out[171]:
In [172]:
X = df_train.T.values
m_ag15 = AgglomerativeClustering(n_clusters=15, linkage='average', affinity='manhattan')
m_ag15.fit(X)
labels_ag15 = pd.Series(m_ag15.labels_, name='k_ag', index=df_train.T.index)
pd.DataFrame(labels_ag15.value_counts()).T
Out[172]:
In [181]:
X_pca = PCA(n_components=2, whiten=True).fit_transform(df_train.T.values)
X_pca_fft = PCA(n_components=2, whiten=True).fit_transform(df_fft_train.T.values)
X_pca3 = PCA(n_components=3, whiten=True).fit_transform(df_train.T.values)
X_pca3_fft = PCA(n_components=3, whiten=True).fit_transform(df_fft_train.T.values)
In [1174]:
# TRAIN DATA: (event detection)
MARGEN_ABS = 50
ROLL_WINDOW_STD = 7
@timeit('process_data_for_event_detection', verbose=True)
def process_data_for_event_detection(train, kernel_size_wiener=15):
    train_ev = train.copy()
    train_ev['wiener'] = wiener(train_ev.power, kernel_size_wiener)
    train_ev['delta_wiener'] = (train_ev.wiener - train_ev.wiener.shift()).fillna(0)
    train_ev['abs_ch'] = train_ev['delta_wiener'].abs() > MARGEN_ABS
    roll = train_ev['wiener'].rolling(ROLL_WINDOW_STD, center=True)
    shift_roll = -(ROLL_WINDOW_STD // 2 + ROLL_WINDOW_STD % 2)
    train_ev['r_std'] = roll.std().shift(shift_roll).fillna(method='ffill')
    train_ev['r_mean'] = roll.mean().shift(shift_roll).fillna(method='ffill')
    # train_ev.to_hdf('/Users/uge/Dropbox/PYTHON/PYPROJECTS/enerpi/notebooks/train.h5', 'power')
    train_ev.info()
    return train_ev
train_ev = process_data_for_event_detection(train)
train_ev.head()
Out[1174]:
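A quick look at the rolling-window alignment used above (a sketch on toy data): with ROLL_WINDOW_STD = 7 and center=True, shifting by -(7 // 2 + 7 % 2) = -4 turns the centered statistic at i into one computed over the seven upcoming samples i+1..i+7, which is what the event logic later consumes as next_std / next_mean.
In [ ]:
s = pd.Series(np.arange(10.))
r = s.rolling(7, center=True).mean().shift(-4)
print(r.values[:4])  # [4. 5. 6. nan] -> r[i] == s[i+1:i+8].mean()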
In [1175]:
# Subset of train data:
df_1 = train_ev.loc['2016-09-10'].between_time('8:30', '10:30')
df_2 = train_ev.loc['2016-09-08'].between_time('8:00', '14:00')
t0, tf = '2016-09-10 19:30', '2016-09-10 23:40'
df_3 = train_ev.loc[t0:tf]
t0, tf = '2016-09-10 20:30', '2016-09-10 22:50'
df_4 = train_ev.loc[t0:tf]
dfs_row = [df_1, df_2, df_3, df_4]
f, axes = plt.subplots(len(dfs_row) // 2, len(dfs_row) // 2, figsize=(18, 12))
for ax, df in zip(axes.ravel(), dfs_row):
    df.power.plot(ax=ax, lw=.25, color=tableau20[0], alpha=.4)
    df.wiener.plot(ax=ax, lw=.75, color=tableau20[0], alpha=.7)
    ax.xaxis.set_major_formatter(mpd.DateFormatter('%H:%M:%S'))
    ax.set_xlabel('')
f.tight_layout()
plt.show()
In [1180]:
FRAC_PLANO_INTERV = .9   # fraction of flat samples (no appreciable ∆) in the interval
N_MIN_FRAC_PLANO = 10    # minimum number of samples in an interval to apply FRAC_PLANO_INTERV
P_OUTLIER = .95
P_OUTLIER_SAFE = .99
MIN_EVENT_T = 6
MIN_VALIDOS = 3
MIN_STD = 20
def _es_outlier(pnorm, p_limit=P_OUTLIER):
    if (pnorm < 1 - p_limit) or (pnorm > p_limit):
        return True
    return False
def _cambio_futuro(std1, std2, m1, m2):
    d = abs(m1 - m2)
    m = (m1 + m2) / 2.
    dc = abs(std1 - std2)
    mc = (std1 + std2) / 2.
    # if (((std1 < 50) and (std2 > 100)) or ((std2 < 50) and (std1 > 100))) and ((d > 1000) or (d / m > 1)):
    if ((dc > 100) or (dc / mc > 5)) or ((d > 1000) or (d / m > 1)):
        return True
    return False
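# Quick illustration of the outlier rule (sketch; the numbers are made up):
# a 310 W sample under a running interval model N(200, 40) sits at the
# ~99.7th percentile, outside [1 - P_OUTLIER, P_OUTLIER] = [.05, .95]:
_pnorm_demo = scipy.stats.norm.cdf(310, 200, 40)
print(_pnorm_demo, _es_outlier(_pnorm_demo))  # ~0.997 True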
# Interval detection
@timeit('rect_smoothing', verbose=True)
def rect_smoothing(df, verbose=False):
    N = df.shape[0]
    is_start_event = np.zeros(N, dtype=bool)
    change = np.zeros(N, dtype=np.int8)
    step = np.zeros(N, dtype=int)
    step_med = np.zeros(N, dtype=int)
    temp = np.zeros((N, 13))
    counter_deb = 0
    cols_deb = 'n, n_planos, mean_t, std_t, mean_planos, std_planos, mean_final, std_final, mean_u, std_u, pnorm, pnorm_next, error_cum'.split(', ')
    print_ok(cols_deb)
    c_interval = c_interval_ch = None
    last_idx_ch = last_step = last_std = n = 0
    idx_ch_ant = 0
    f_planos = lambda interval, interval_ch: np.array(interval)[np.nonzero(np.array(interval_ch) == 0)]
    for i, (p, hay_ch_abs, incr, next_incr, next_std, next_mean
            ) in enumerate(zip(df['wiener'], df['abs_ch'],
                               df['delta_wiener'].shift(-1), df['delta_wiener'].shift(-2),
                               df['r_std'], df['r_mean'])):
        sufficient_event = n > MIN_EVENT_T
        hay_ch_min = (abs(incr) > 15) or (abs(incr + next_incr) > 30)
        if i == 0:
            c_interval, c_interval_ch = [p], [0]
            c_interval_ant, c_interval_ch_ant = [p], [0]
            n += 1
        elif i == N - 3:
            print('last')
            c_interval.append(p)
            c_interval_ch.append(hay_ch_min)
            last_step = np.mean(c_interval)
            step[last_idx_ch:] = int(np.mean(c_interval))
            step_med[last_idx_ch:] = int(np.median(c_interval))
            change[last_idx_ch:-2] = np.array(c_interval_ch, dtype=np.int8)
        elif not sufficient_event:
            c_interval.append(p)
            c_interval_ch.append(hay_ch_min)
            n += 1
        else:
            dir_ch = 0
            if hay_ch_abs:
                dir_ch = 1 if incr > 0 else -1
            v_planos = f_planos(c_interval, c_interval_ch)
            n_planos = v_planos.shape[0]
            std_t = np.std(c_interval[2:])
            mean_t = np.mean(c_interval[2:])
            err_dist = std_t / np.sqrt(n - 2)
            deb_hay_planos = deb_usa_final = deb_usar = 0
            if n_planos > 3:
                deb_hay_planos = 1
                std_planos = np.std(v_planos)
                mean_planos = np.mean(v_planos)
                err_dist_p = std_planos / np.sqrt(n_planos)
            else:
                std_planos = std_t
                mean_planos = mean_t
                err_dist_p = err_dist
            if n_planos > 2 * ROLL_WINDOW_STD:
                deb_usa_final = 1
                std_final = np.std(v_planos[-(2 * ROLL_WINDOW_STD):])
                mean_final = np.mean(v_planos[-(2 * ROLL_WINDOW_STD):])
                err_dist_f = std_final / np.sqrt(len(v_planos[-(2 * ROLL_WINDOW_STD):]))
            else:
                std_final = std_planos
                mean_final = mean_planos
                err_dist_f = err_dist_p
            if (n > N_MIN_FRAC_PLANO) and (n_planos / n > FRAC_PLANO_INTERV):
                deb_usar = 1
                # std_u = std_planos
                # mean_u = mean_planos
                # err_dist_u = err_dist_p
                std_u = std_final
                mean_u = mean_final
                err_dist_u = err_dist_f
            # elif (n > N_MIN_FRAC_PLANO):
            #     deb_usar = 2
            #     std_u = std_final
            #     mean_u = mean_final
            #     err_dist_u = err_dist_f
            else:
                deb_usar = 3
                std_u = std_t
                mean_u = mean_t
                err_dist_u = err_dist
            # Condition for an interval different from the previous one, at k and k+1:
            pnorm = scipy.stats.norm.cdf(p, mean_u, min(150, std_u + err_dist_u))
            pnorm_next = scipy.stats.norm.cdf(p + next_incr, mean_u, min(150, std_u + err_dist_u))
            # DEBUG
            interv_considerados = np.array(c_interval[2:])
            error_cum = np.sqrt(np.mean((interv_considerados - np.mean(interv_considerados))**2))
            temp[i, :] = [n, n_planos, mean_t, std_t, mean_planos, std_planos, mean_final, std_final, mean_u, std_u, pnorm, pnorm_next, error_cum]
            # Event-detection logic:
            next_dif_mean_supera_std = abs(mean_u - next_mean) > std_u
            es_outlier_safe = _es_outlier(pnorm, P_OUTLIER_SAFE)
            es_outlier = _es_outlier(pnorm, P_OUTLIER)
            next_es_outlier_safe = _es_outlier(pnorm_next, P_OUTLIER_SAFE)
            next_es_outlier = _es_outlier(pnorm_next, P_OUTLIER)
            hay_cambio_futuro = _cambio_futuro(std_final, next_std, mean_final, next_mean)
            hay_incr_considerar = abs(incr) > 10
            vip = ((hay_ch_abs and next_dif_mean_supera_std and es_outlier and next_es_outlier_safe)
                   or (hay_incr_considerar and es_outlier and next_es_outlier_safe and hay_cambio_futuro))
            if sufficient_event and vip:  # and (c_interval_ch[-1] == 0) and (c_interval_ch[-2] == 0):
                # debug:
                if verbose:
                    print_magenta('\n{} --> {:.0f}, {:.0f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.7f}, {:.7f}'.format(counter_deb, *temp[i, :]))
                    print(incr, hay_incr_considerar, hay_ch_abs, next_dif_mean_supera_std, es_outlier, es_outlier_safe, next_es_outlier, next_es_outlier_safe, hay_cambio_futuro)
                    cad_deb = 'deb_hay_planos={}, deb_usa_final={}, deb_usar={}'.format(deb_hay_planos, deb_usa_final, deb_usar)
                    if hay_ch_abs and _es_outlier(pnorm, P_OUTLIER_SAFE) and _es_outlier(pnorm_next, P_OUTLIER_SAFE):
                        print_info('* p={:.0f} W, jump with ch_abs={:.1f} and safe_outlier: [{:.5f}, {:.5f}]; {}, error_cum={:.2f}'
                                   .format(p, incr, pnorm, pnorm_next, cad_deb, error_cum))
                    else:
                        print_cyan('with p={:.0f} W, jump due to cambio_futuro. std: {:.1f} -> {:.1f}, mean: {:.1f} -> {:.1f}; and safe_outlier: [{:.7f}, {:.7f}]; {}, error_cum={:.2f}'
                                   .format(p, std_final, next_std, mean_final, next_mean, pnorm, pnorm_next, cad_deb, error_cum))
                    print_red('{:%H:%M:%S} => m_u={:.1f}, std_u={:.1f}, e_u={:.1f}, last_step={:.1f}, last_std={:.1f}, roll_std={:.1f}, p={:.7f}'
                              .format(df.index[i], mean_u, std_u, err_dist_u, last_step, last_std, next_std, pnorm))
                    # if counter_deb > 50:
                    #     break
                # if n_planos > 5:
                #     last_step = np.median(v_planos)
                #     last_std = np.std(v_planos)
                # else:
                #     last_step = np.median(c_interval)
                #     last_std = np.std(c_interval)
                last_step = np.mean(c_interval)
                last_std = np.std(c_interval)
                is_start_event[i] = True
                step[last_idx_ch:i] = int(np.mean(c_interval))
                step_med[last_idx_ch:] = int(np.median(c_interval))
                change[last_idx_ch:i] = np.array(c_interval_ch, dtype=np.int8)
                idx_ch_ant = last_idx_ch
                c_interval_ant, c_interval_ch_ant = c_interval, c_interval_ch
                c_interval, c_interval_ch = [p], [vip]
                last_idx_ch = i
                n = 1
                counter_deb += 1
                error_cum = 0
            else:
                # if (counter_deb == 4) and (i % 3 == 0):
                #     print_red('{} [{:%H:%M:%S}], {:.0f} W --> {:.0f}, {:.0f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.0f}, {:.1f}, {:.7f}, {:.7f}'
                #               .format(counter_deb, df.index[i], p, *temp[i, :]))
                c_interval.append(p)
                c_interval_ch.append(hay_ch_min)
                n += 1
    print(N, i)
    df_step = pd.DataFrame({'step_median': step_med, 'step_mean': step, 'ch': change, 'is_init': is_start_event},
                           columns=['step_median', 'step_mean', 'ch', 'is_init'], index=df.index)
    df_debug = pd.DataFrame(temp, columns=cols_deb, index=df_step.index)
    return df_step, df_debug
def _std_nonzero(x):
    N = x.shape[0]
    # idx_act = np.nonzero(x.ch == 0)[0]
    # interv.ix[np.nonzero(x.ch)[0][-1] + 1:].wiener.std()
    idx_act = np.nonzero(x.ch)[0]
    if len(idx_act) == 0:
        # print_red(x.ch)
        return np.std(x.wiener)
    elif idx_act[-1] + 1 < N:
        v_usar = x.wiener.values[idx_act[-1] + 1:]
        # print(N, idx_act[-1] + 1, len(v_usar))
        return np.std(v_usar)
    else:
        # print(N, idx_act, '\n', x)
        return -1
def _valid_ch(interv_ch):
    idx_act = np.nonzero(interv_ch)[0]
    if (len(idx_act) == 0) or (idx_act[-1] + 1 < len(interv_ch)):
        return True
    else:
        return False
def genera_df_intervalos(df_data, df_step):
    train_step_t = df_data[['wiener']].join(df_step)
    train_step_t['interv_shift'] = train_step_t.is_init.cumsum().shift(-1).fillna(method='ffill').astype(int)
    gb = train_step_t.tz_localize(None).reset_index().groupby('interv_shift')
    df_interv = pd.concat([gb.ch.apply(lambda x: sum(abs(x))).rename('n_ch'),
                           gb.ch.sum().rename('abs_ch'),
                           gb.ch.count().rename('n'),
                           gb.wiener.median().rename('median_all').round(),
                           gb.wiener.std().rename('std_all').round(1),
                           gb.step_median.last().rename('delta') - gb.step_median.first().values,
                           gb.step_median.last().rename('step_v'),
                           gb.ch.apply(_first_index_permanent).rename('rp'),
                           gb.ts.first(),
                           gb.ch.apply(_valid_ch),
                           gb.wiener.apply(lambda x: np.std(x[5:])),
                           gb.apply(_std_nonzero).round(1).rename('std_c')], axis=1)
    return df_interv
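# How interv_shift labels intervals (toy sketch): cumsum counts event starts,
# shift(-1) moves each boundary one sample earlier, and ffill closes the tail:
_ev = pd.Series([False, False, True, False, False, True, False])
print(_ev.cumsum().shift(-1).fillna(method='ffill').astype(int).values)  # [0 1 1 1 2 2 2]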
def plot_steps_detection(df_data, df_step):
    ax = df_data.wiener.plot(figsize=(18, 6), lw=.5, alpha=.3, color=tableau20[0])
    df_step.step_median.plot(ax=ax, lw=1, alpha=.9, color=tableau20[4])
    (df_step.ch.abs() * 500).plot(ax=ax, lw=.75, alpha=.3, color=tableau20[6])
    ax.xaxis.set_major_formatter(mpd.DateFormatter('%H:%M:%S'))
    return ax
# Interval detection
df = df_1.between_time('8:51', '9:02')
df_step, df_debug = rect_smoothing(df, verbose=False)
# INTERVALS:
df_interv = genera_df_intervalos(df, df_step)
# Show
plot_steps_detection(df, df_step)
df_interv.tail(15)
Out[1180]:
In [1181]:
df = df_2.between_time('13:45', '15:30')
df_step, df_debug = rect_smoothing(df, verbose=True)
df_interv = genera_df_intervalos(df, df_step)
plot_steps_detection(df, df_step)
df_interv.tail(15)
Out[1181]:
In [1182]:
df = df_3
df_step, df_debug = rect_smoothing(df, verbose=False)
df_interv = genera_df_intervalos(df, df_step)
plot_steps_detection(df, df_step)
df_interv.head(15)
Out[1182]:
In [1183]:
df = df_4
df_step, df_debug = rect_smoothing(df, verbose=False)
df_interv = genera_df_intervalos(df, df_step)
plot_steps_detection(df, df_step)
df_interv.head(15)
Out[1183]:
In [1076]:
ax = df_debug.mean_t.plot(lw=.5)
df_debug.std_t.plot(ax=ax, lw=.75)
df_debug.std_planos.plot(ax=ax, lw=.75)
#df_debug.plot(ax=ax, lw=.75)
(df_debug.error_cum).plot(ax=ax, lw=1)
ax.xaxis.set_major_formatter(mpd.DateFormatter('%H:%M:%S'))
In [1218]:
pd.read_hdf('/Users/uge/Desktop/bkp_enerpidata/OLD_STORES/enerpi_data.h5', 'rms')
Out[1218]:
In [1217]:
#pd.read_hdf('/Users/uge/Desktop/bkp_enerpidata/OLD_STORES/enerpi_data.h5', 'rms')
with pd.HDFStore('/Users/uge/Desktop/bkp_enerpidata/OLD_STORES/enerpi_data.h5', 'r') as st:
    print_info(st)
    bad = st.select('rms', start=0, stop=324324)
    print_ok(bad.head())
bad.tail()
Out[1217]:
In [ ]:
bad = st.select
In [ ]:
bad = st.get
For event and level detection, starting from the 'power' signal filtered with a Wiener filter, the changes over time are analyzed, assuming level jumps, by "rectangularizing" the signal.
The change instants (rises or falls) are marked, and the level value used is the median of the no-change samples within each interval (stretches of approximately constant signal, according to a 90% probability interval, assuming the values within the interval are normally distributed (→ mean + std)).
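As a rough quality measure for this rectangularization (a sketch reusing the df / df_step pair left by the last detection cell above):
In [ ]:
rmse = np.sqrt(np.mean((df_step.step_median.values - df.wiener.values) ** 2))
print('RMSE of the step approximation vs. the Wiener-filtered signal: {:.1f} W'.format(rmse))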
In [893]:
interv = train_step_t[train_step_t.interv_shift == 1]
#interv.ix[np.nonzero(interv.ch == 0)]
interv
interv.iloc[np.nonzero(interv.ch.values)[0][-1] + 1:].wiener.std(), interv.iloc[np.nonzero(interv.ch.values == 0)[0]].wiener.std(), interv.wiener.std()
Out[893]:
In [690]:
ax = df_test.wiener.plot(lw=.5, figsize=(16, 6))
df_test.wiener.rolling(7).std().plot(lw=.5, ax=ax)
df_test.wiener.rolling(5, center=True).std().plot(lw=.5, ax=ax)
#df_test.wiener.rolling(15, center=True).std().plot(lw=.5, ax=ax)
Out[690]:
In [632]:
plot_tt_fft(df_test.iloc[1000:-700].wiener, max_freq=5, best_freqs=10)
Out[632]:
In [570]:
c_interval = train.loc[df_step[df_step.interv_shift == n_interv].index, 'wiener'].tolist()
c_interval_ch = df_step[df_step.interv_shift == n_interv].ch.tolist()
%memit validos = [v for i, v in enumerate(c_interval) if c_interval_ch[i] == 0]
%timeit validos = [v for i, v in enumerate(c_interval) if c_interval_ch[i] == 0]
%memit validos_np = np.array(c_interval, dtype=np.int)[np.nonzero(np.array(c_interval_ch, dtype=np.int8) == 0)]
%timeit validos_np = np.array(c_interval, dtype=np.int)[np.nonzero(np.array(c_interval_ch, dtype=np.int8) == 0)]
print(len(c_interval), len(validos), np.mean(validos), len(validos_np), np.mean(validos_np))
In [481]:
mean = 1997
std = 40
offset = 300
p = .9
x_p = np.linspace(mean - offset, offset + mean, 2*offset + 1)
curva_p = scipy.stats.norm.pdf(x_p, mean, std)
plt.plot(x_p, curva_p)
plt.vlines(scipy.stats.norm.interval(p, mean, std / 10), 0, np.max(curva_p), color='r', lw=2, alpha=.5)
plt.vlines(scipy.stats.norm.interval(p, mean, std), 0, np.max(curva_p), color='y', lw=2, alpha=.5)
plt.vlines(scipy.stats.norm.interval(p, mean, 5 * std), 0, np.max(curva_p), color='g', lw=2, alpha=.5)
Out[481]:
In [260]:
def _f_roll_difs(x):
    if (np.sum(x) > 50) and (x[len(x) // 2 + 1] == np.max(x)):
        return x[len(x) // 2 + 1] - (x[0] + x[-1]) / 2
    return 0
df_1.wiener.diff().rolling(5, center=True).apply(_f_roll_difs).plot(lw=.75)
Out[260]:
In [178]:
m_hd = HDBSCAN(min_cluster_size=5, min_samples=None, p=.5)
m_hd.fit(X)
labels_hd = pd.Series(m_hd.labels_, name='k_hd', index=df_train.T.index)
print_info(pd.DataFrame(labels_hd.value_counts()).T)
plot_clustering(X, m_hd.labels_)
Out[178]:
In [177]:
m_hd_fft = HDBSCAN(min_cluster_size=5, min_samples=None, p=.5)
m_hd_fft.fit(df_fft_train.T.values)
labels_hdfft = pd.Series(m_hd_fft.labels_, name='k_hd_fft', index=df_fft_train.T.index)
print_info(pd.DataFrame(labels_hdfft.value_counts()).T)
plot_clustering(df_fft_train.T.values, m_hd_fft.labels_)
Out[177]:
In [179]:
std_scaler = StandardScaler()
X_std_fft = std_scaler.fit_transform(df_fft_train.T.values)
m_ag_fft = AgglomerativeClustering(n_clusters=15, linkage='average', affinity='manhattan')
m_ag_fft.fit(X_std_fft)
labels_ag = pd.Series(m_ag_fft.labels_, name='k_ag_fft', index=df_fft_train.T.index)
print_info(pd.DataFrame(labels_ag.value_counts()).T)
plot_clustering(X_std_fft, m_ag_fft.labels_)
Out[179]:
In [163]:
plot_clustering(X_std[:,:50], m_ag.labels_, title=None,
core_samples_mask=None, cluster_centers_indices=None, cluster_centers=None, force_scatter=False,
ax=None)
Out[163]:
In [192]:
_plot_reduced_clustering(X_pca3_fft, m_ag_fft.labels_, lw=.25, s=40)
_plot_reduced_clustering(X_pca_fft, m_ag_fft.labels_, lw=.25, s=40)
In [318]:
#_plot_reduced_clustering(X_pca3, axes3d=True)
_plot_reduced_clustering(X_pca3)
_plot_reduced_clustering(X_pca)
_plot_reduced_clustering(X_pca20[:,3:])
In [10]:
tsne = TSNE(n_components=2)
X_tsne20 = tsne.fit_transform(X_pca20)
X_tsne20
Out[10]:
In [18]:
_plot_reduced_clustering(X_tsne20)
In [319]:
m_hd = HDBSCAN(min_cluster_size=15, min_samples=5, core_dist_n_jobs=8)
m_hd.fit(X_pca20)
_plot_reduced_clustering(X_pca20, m_hd.labels_)
_show_info_clustering(X_pca20, m_hd.labels_)
In [320]:
_plot_reduced_clustering(X_pca, m_hd.labels_)
In [321]:
m_hd = HDBSCAN(min_cluster_size=10, min_samples=3, core_dist_n_jobs=8)
m_hd.fit(X_pca20)
_plot_reduced_clustering(X_pca20, m_hd.labels_)
_show_info_clustering(X_pca20, m_hd.labels_)
In [322]:
m_hd = HDBSCAN(min_cluster_size=10, min_samples=3, core_dist_n_jobs=8)
m_hd.fit(X_std)
_plot_reduced_clustering(X_std, m_hd.labels_)
_show_info_clustering(X_std, m_hd.labels_, labels_true=None)
In [323]:
m_hd = HDBSCAN(min_cluster_size=10, min_samples=3, core_dist_n_jobs=8)
m_hd.fit(X)
_plot_reduced_clustering(X_pca3, m_hd.labels_)
_show_info_clustering(X, m_hd.labels_)
In [324]:
m_pca50 = PCA(n_components=50, whiten=False)
X_pca50 = m_pca50.fit_transform(X)
X_pca50.shape
Out[324]:
In [325]:
m_hd = HDBSCAN(min_cluster_size=5, min_samples=5, core_dist_n_jobs=8)
m_hd.fit(X_pca50)
_plot_reduced_clustering(X_pca, m_hd.labels_)
_show_info_clustering(X_pca50, m_hd.labels_)
In [339]:
std_freqs = results_FFT.std(axis=1)
cond_freq = std_freqs > .02 * std_freqs.max()
print(results_FFT.shape, results_FFT[cond_freq].shape)
results_FFT[cond_freq][results_FFT.columns[:500]].plot(lw=.5, legend='', alpha=.1, c='k')
Out[339]:
In [90]:
pd.Series(m_hd.probabilities_).hist(bins=200, lw=0, log=True)
Out[90]:
In [277]:
pd_ts_df = ax_t.xaxis.get_major_formatter()
pd_ts_df
Out[277]:
In [267]:
freqs1, mags1 = process_fft(power_standby.power, with_freq=True)
freqs2, mags2 = process_fft(power_day14.power, with_freq=True)
ax_t, ax_f = plot_tt_fft(power_standby.power, freqs1, mags1)
ax_t
#_plot_tt_fft(tt, x, df_fft, best_fft, log_mag=log_mag, figsize=figsize)
Out[267]:
In [191]:
np.concatenate?
In [86]:
# Split results_FFT for cluster k into consecutive intervals:
def _group_intervals(df, step=pd.Timedelta('5min')):
    groups = []
    start = df.index[0]
    end = start + step
    for t in df.index[1:]:
        if t - end > 1.1 * step:
            groups.append(df.loc[start:end])
            start, end = t, t + step
        else:
            end += step
    groups.append(df.loc[start:end])  # close the last running interval
    print_info('LEN={}, GROUPS={}'.format(len(df), len(groups)))
    return groups
g_nulos = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 31])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g_nulos];
In [84]:
def plot_muestra_tt(alldata, clustered_intervals, idx_samples=(0, 1, 2, 3, 4, 5), offset=pd.Timedelta('10min')):
    nr = len(idx_samples) // 3
    if len(idx_samples) % 3 != 0:
        nr += 1
    fig = plt.figure(figsize=(18, 5 * nr))
    for i, df_fft in enumerate([clustered_intervals[idx] for idx in idx_samples]):
        ax = fig.add_subplot(nr, 3, i + 1)
        t0, tf = df_fft.index[0], df_fft.index[-1]
        df_tt = alldata.loc[t0:tf + offset]
        df_tt.plot(ax=ax, lw=1, color=tableau20[4], alpha=.8, legend='')
        ax.xaxis.tick_bottom()
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0, ha='center')
        ax.xaxis.set_tick_params(labelsize=8, pad=2)
        ax.grid('on', axis='x')
        ax.xaxis.set_major_formatter(mpd.DateFormatter('%H:%M'))
        ax.xaxis.set_major_locator(mpd.MinuteLocator(range(0, 60, 5)))
        ax.xaxis.set_minor_locator(mpd.MinuteLocator())
        ax.annotate('{:%d-%b}, (cont:{})'.format(t0, len(df_fft)), (.98, .98), xycoords='axes fraction',
                    fontsize=8, color=tableau20[2], va='top', ha='right')
In [87]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 20])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1, 2, 3, 4, 5, 6, 7, 8));
In [88]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 17])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1, 2, 3, 4, 5));
In [94]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 15])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1, 2, 3, 4));
In [97]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 16])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1, 2, 3));
In [101]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 12])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1));
In [107]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 2])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, (0, 1, 2, 3, 4, 5));
In [110]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 3])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
plot_muestra_tt(homog_power, g, [0]);
In [116]:
g = _group_intervals(results_FFT.T.loc[m_hd.labels_ == 30])
[print_red('{}, from {} to {}'.format(df.shape, df.index[0], df.index[-1])) for df in g]
#plot_muestra_tt(homog_power, g, [0, 1, 2, 3, 4]);
Out[116]:
In [143]:
grupos = [_group_intervals(results_FFT.T.loc[m_hd.labels_ == k]) for k in set(m_hd.labels_)]
grupos[0][0]
Out[143]:
In [124]:
plot_muestra_tt(homog_power, grupos[-7], [0, 1, 2, 3, 4]);
In [140]:
ax = results_FFT.T.iloc[1000].plot(lw=.5, alpha=.8)
ax2 = plt.axes([.6, .6, .3, .3])
#ax2 = plt.subplot(position=[[0.125, 0.3], [0.9, 0.7]])
ax2.get_position()
Out[140]:
In [138]:
ax1 = plt.axes([0, 0, 1, 1])
ax2 = plt.axes([.6, .6, .3, .3])
In [92]:
pd.Series(m_hd.labels_).hist(bins=len(set(m_hd.labels_)), lw=0, log=True)
Out[92]:
In [21]:
plt.figure(figsize=(16, 9))
ax = plt.subplot(121)
results_FFT.loc[:.006].plot(ax=ax, lw=.5, color='k', alpha=.01)
plt.legend([])
ax = plt.subplot(122)
results_FFT.loc[.006:].plot(ax=ax, lw=.5, color='k', alpha=.1)
plt.legend([])
Out[21]:
In [22]:
k = 14
#m_km = KMeans(k, n_jobs=-1).fit(X)
#m_km
In [69]:
#df_fft_all.T
#pd.Series(m_km.labels_).value_counts()
plt.figure(figsize=FS)
ax = plt.subplot(111)
for ki in range(k):
    df_fft_all.T.iloc[m_km.labels_ == ki].iloc[:50].T.plot(ax=ax, lw=.5, c=next(cm), alpha=.05, legend='')
plt.xlim(0, .15)
plt.ylim(0, 150)
Out[69]:
In [80]:
f, axes = plt.subplots(6, 3, figsize=(12, 24))
for ki, ax in zip(range(k), axes.ravel()):
    # ax = plt.subplot(6, 3, k+1)
    sub_data = df_fft_all.T.iloc[m_km.labels_ == ki]
    n = len(sub_data)
    color = next(cm)
    # sub_data.T.plot(ax=ax, lw=.5, c=color, alpha=1.2/n, legend='')
    sub_data.iloc[:2].T.plot(ax=ax, lw=.5, c=color, alpha=.5, legend='')
    ax.set_xlim((0, .15))
    ax.set_ylim((0, 150))
    ax.grid('on', 'x')
    ax.xaxis.tick_bottom()
    ax.annotate('K={}, {} members'.format(ki, n), (.5, .98), xycoords='axes fraction',
                color=color, fontsize=12, va='top', ha='center')
plt.show()
In [83]:
m_hd = HDBSCAN().fit(X)
m_hd
Out[83]:
In [23]:
#pd.Series(m_hd.labels_).value_counts(), m_hd.labels_.max()
In [89]:
f, axes = plt.subplots(6, 3, figsize=(12, 24))
for ki, ax in zip(range(-1, k), axes.ravel()):
    # ax = plt.subplot(6, 3, k+1)
    sub_data = df_fft_all.T.iloc[m_hd.labels_ == ki]
    n = len(sub_data)
    color = next(cm)
    sub_data.T.plot(ax=ax, lw=.5, c=color, alpha=1.2/n, legend='')
    # sub_data.iloc[:2].T.plot(ax=ax, lw=.5, c=color, alpha=.5, legend='')
    ax.set_xlim((0, .15))
    ax.set_ylim((0, 150))
    ax.grid('on', 'x')
    ax.xaxis.tick_bottom()
    ax.annotate('K={}, {} members'.format(ki, n), (.5, .98), xycoords='axes fraction',
                color=color, fontsize=12, va='top', ha='center')
plt.show()
In [113]:
ki = 10
sub_data = df_fft_all.T.iloc[m_hd.labels_ == ki]
sub_data.T.plot(c='k', alpha=1/len(sub_data), lw=.25, legend='')
Out[113]:
In [114]:
homog_power.loc[sub_data.index[-1]:sub_data.index[-1] + pd.Timedelta('3h')].plot(lw=.5)
Out[114]:
In [116]:
k = 36
labels_pca, n_clusters_pca, model_pca, X_pca = pca_clustering(X, k, whiten=True, labels_true=None)
print(n_clusters_pca)
plot_clustering_figure(X, labels_pca, plot_clustering_as_voronoi, X_pca, model_pca)
In [ ]:
X = results_FFT.T.values
X_std = StandardScaler().fit_transform(X)